pacman::p_load(haven, sf, tidyverse, tmap, dplyr, maptiles)
data <- read_dta("/Users/michaelhall/Desktop/school/TZ data/TZ_cleaned_data/N_cluster_size.dta")
# It turns out the gps coordinates were not recorded for some observations!
na_long <- sum( is.na(data$gps_longitude) )
na_lat <- sum( is.na(data$gps_latitude) )
# Filter out NAs
data_clean <- data %>%
filter(
!is.na(gps_longitude),
!is.na(gps_latitude)
)
data_sf <- data_clean |>
st_as_sf(
coords = c("gps_longitude", "gps_latitude"),
crs = 4326,
remove = FALSE
)
data_sf <- st_transform(data_sf, crs = 21037)
cluster_list <- st_is_within_distance(data_sf, data_sf, dist = 100)
# Subtract 1 to exclude the vendor themselves from the cluster
data_sf$cluster_100m <- lengths(cluster_list) - 1
cluster_list <- st_is_within_distance(data_sf, data_sf, dist = 50)
data_sf$cluster_50m <- lengths(cluster_list) - 1
cluster_list <- st_is_within_distance(data_sf, data_sf, dist = 20)
data_sf$cluster_20m <- lengths(cluster_list) - 1
Awesome! We now have measures of clusters. Two additional tasks would be:
For i. we can compare for each vendor as well as ttest to see if the means are different
# Check the names of columns and find the self-reported cluster size of SAME
# food vendors from 2024 (the same year of GPS coordinates)
colnames(data_sf)
## [1] "cluster_size_allfood2015" "cluster_size_samefood2015"
## [3] "cluster_size_allfood2019" "cluster_size_samefood2019"
## [5] "cluster_size_allfood2020" "cluster_size_samefood2020"
## [7] "cluster_size_allfood2021" "cluster_size_samefood2021"
## [9] "cluster_size_allfood2022" "cluster_size_samefood2022"
## [11] "cluster_size_allfood2023" "cluster_size_samefood2023"
## [13] "cluster_size_allfood2024" "cluster_size_samefood2024"
## [15] "gps_latitude" "gps_longitude"
## [17] "gps_precision" "___id"
## [19] "sob" "wgt"
## [21] "geometry" "cluster_100m"
## [23] "cluster_50m" "cluster_20m"
# Check the class of the self-reported cluster size to ensure it is numeric
class(data_sf$cluster_size_samefood2024)
## [1] "numeric"
# Compare the two cluster measures
mean(data_sf$cluster_size_samefood2024 == data_sf$cluster_100m)
## [1] 0.08730873
# They are different! Only 8.7 % of vendors reported the same number of adjacent
# vendors as the 100 meters' calculation.
# An exact number is a high bar. Let's see how many were within 4 vendors.
mean(
abs(data_sf$cluster_size_samefood2024 - data_sf$cluster_100m) <= 4,
na.rm = TRUE
)
## [1] 0.5787579
# Much higher! 58 % of vendors reported a cluster size of vendors within 4 of
# the GPS calculation
# Now ttest to see on the average if the two are different
t.test(
x = data_sf$cluster_size_samefood2024,
y = data_sf$cluster_100m,
paired = TRUE,
alternative = "two.sided"
)
##
## Paired t-test
##
## data: data_sf$cluster_size_samefood2024 and data_sf$cluster_100m
## t = 0.92358, df = 1110, p-value = 0.3559
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -0.2752927 0.7649417
## sample estimates:
## mean difference
## 0.2448245
# The t-test suggest the two measures are not significantly different! This is
# great news as it will add robustness to using past measures of cluster size
# obtained via recall.
For ii. we begin by creating buffers
buffers_100 <- data_sf |>
st_buffer(dist = 100) |>
mutate(radius = 100, count = cluster_100m)
buffers_50 <- data_sf |>
st_buffer(dist = 50) |>
mutate(radius = 50, count = cluster_50m)
buffers_20 <- data_sf |>
st_buffer(dist = 20) |>
mutate(radius = 20, count = cluster_20m)
buffers_all <- bind_rows(buffers_20, buffers_50, buffers_100)
Then plot a static map.
library(sf)
library(tmap)
# Compute 100 m buffer around each point
buffers_100 <- st_buffer(data_sf, dist = 100)
# Count how many points fall in each buffer
counts <- lengths(st_intersects(buffers_100, data_sf))
buffers_100$cluster_size <- counts
# Plot with bubble size = cluster_size
tmap_mode("plot")
## ℹ tmap mode set to "plot".
tm_basemap("OpenStreetMap") +
tm_shape(buffers_100) +
tm_bubbles(size = "cluster_size",
fill = "lightblue",
col = "darkblue",
lwd = 0.8,
fill_alpha = 0.3,
col_alpha = 0.5,
) +
tm_layout(
frame = FALSE,
inner.margins = c(0,0,0,0)
)
Plot an interactive map
library(sf)
library(tmap)
# 1. Check data class and CRS
print(class(buffers_100))
## [1] "sf" "tbl_df" "tbl" "data.frame"
print(st_crs(buffers_100))
## Coordinate Reference System:
## User input: EPSG:21037
## wkt:
## PROJCRS["Arc 1960 / UTM zone 37S",
## BASEGEOGCRS["Arc 1960",
## DATUM["Arc 1960",
## ELLIPSOID["Clarke 1880 (RGS)",6378249.145,293.465,
## LENGTHUNIT["metre",1]]],
## PRIMEM["Greenwich",0,
## ANGLEUNIT["degree",0.0174532925199433]],
## ID["EPSG",4210]],
## CONVERSION["UTM zone 37S",
## METHOD["Transverse Mercator",
## ID["EPSG",9807]],
## PARAMETER["Latitude of natural origin",0,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8801]],
## PARAMETER["Longitude of natural origin",39,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8802]],
## PARAMETER["Scale factor at natural origin",0.9996,
## SCALEUNIT["unity",1],
## ID["EPSG",8805]],
## PARAMETER["False easting",500000,
## LENGTHUNIT["metre",1],
## ID["EPSG",8806]],
## PARAMETER["False northing",10000000,
## LENGTHUNIT["metre",1],
## ID["EPSG",8807]]],
## CS[Cartesian,2],
## AXIS["(E)",east,
## ORDER[1],
## LENGTHUNIT["metre",1]],
## AXIS["(N)",north,
## ORDER[2],
## LENGTHUNIT["metre",1]],
## USAGE[
## SCOPE["Engineering survey, topographic mapping."],
## AREA["Kenya - south of equator and east of 36°E; Tanzania - east of 36°E."],
## BBOX[-11.75,36,0,41.6]],
## ID["EPSG",21037]]
# Transform it to WGS 84 to get it to display properly
buffers_100_ll <- st_transform(buffers_100, crs = 4326)
# 2. Switch to interactive mode
tmap_mode("view")
## ℹ tmap mode set to "view".
# 3. Plot again with bubble size = cluster size (and play with styling)
tm_tiles("OpenStreetMap") +
tm_shape(buffers_100_ll) +
tm_bubbles(size = "cluster_size",
fill = "red",
col = "darkred",
lwd = 1.2,
fill_alpha = 0.3,
col_alpha = 0.5
)
## Registered S3 method overwritten by 'jsonify':
## method from
## print.json jsonlite
All done! We now have a gps-based measure of 2024 cluster size, we’ve shown that street food vendors are able to estimate their cluster within 100m with fairly good accuracy, and we can two different maps to help visualize where clusters are largest within the city.